from __future__ import division  # BUG FIX: __future__ imports must be the very first statement; anywhere else is a SyntaxError

import numpy as np

np.random.seed(13)  # seeds NumPy's global RNG before Keras is imported; TODO(review): confirm Keras actually draws its weight init from this
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Reshape, Lambda
from keras.utils import np_utils
from keras.utils.data_utils import get_file
from keras.preprocessing.text import Tokenizer
from keras.utils.vis_utils import model_to_dot
from keras.preprocessing import sequence
from gensim.models import KeyedVectors
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors as nn
from matplotlib import pylab
import matplotlib.pyplot as plt
# DO NOT Modify the lines in this cell
path = 'alice.txt'
# first 700 lines of the book; NOTE(review): the handle is never closed (fine in a notebook)
corpus = open(path).readlines()[0:700]
# keep only lines containing at least two spaces (>= 3 whitespace-separated tokens)
corpus = [sentence for sentence in corpus if sentence.count(" ") >= 2]
# strip punctuation; the apostrophe is appended to Keras' default filter set
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'+"'")
tokenizer.fit_on_texts(corpus)
# corpus becomes a list of integer-id sequences, one per kept line
corpus = tokenizer.texts_to_sequences(corpus)
nb_samples = sum(len(s) for s in corpus)
# vocabulary size (+1 because Keras word indices start at 1)
V = len(tokenizer.word_index) + 1
# default embedding dimension; the training loops below iterate over [50, 150, 300] themselves
dim = 100
window_size = 2 #use this window size for Skipgram, CBOW, and the model with the additional hidden layer
window_size_corpus = 4 #use this window size for the co-occurrence matrix
Use the provided code to load the "Alice in Wonderland" text document.
#create co-occurrence matrix
import pandas as pd
from sklearn import preprocessing

#words x words matrix of co-occurrence counts, initialised to zero
#(replaces the original empty-frame/concat/fillna dance with a direct construction)
words = list(tokenizer.word_index.keys())
matrix = pd.DataFrame(0.0, index=words, columns=words)
#inverse index to get word by code
inverse_index = dict((v,k) for k, v in tokenizer.word_index.items())
#count co-occurrences: for every centre word, every other word within the window
for line in corpus:
    for i, word_code in enumerate(line):
        word = inverse_index.get(word_code)
        for j in range(max(0, i-window_size_corpus), min(len(line), i+window_size_corpus+1)):
            # BUG FIX: compare positions (j != i), not token ids. The original
            # `word_code != line[j]` silently skipped genuine co-occurrences
            # whenever the same word appeared twice inside one window.
            if j != i:
                # .loc instead of chained matrix[col][row] assignment;
                # row = context word, column = centre word (same orientation as before)
                matrix.loc[inverse_index.get(line[j]), word] += 1
#normalize each sample (row) to unit L2 norm
normalized_values = preprocessing.normalize(matrix.values)
matrix = pd.DataFrame(normalized_values, index=matrix.index, columns=matrix.columns)
matrix.head()
#find cosine similarity to Alice, Dinah and Rabbit
alice_vector = matrix['alice'].values.reshape(1, -1)
rabbit_vector = matrix['rabbit'].values.reshape(1, -1)
dinah_vector = matrix['dinah'].values.reshape(1, -1)
#report every pair once; the printed text is identical to the original output
for name_a, vec_a, name_b, vec_b in [
        ("Alice", alice_vector, "Rabbit", rabbit_vector),
        ("Alice", alice_vector, "Dinah", dinah_vector),
        ("Dinah", dinah_vector, "Rabbit", rabbit_vector)]:
    pair_similarity = cosine_similarity(vec_a, vec_b)
    print("Cosine similarity between " + name_a + " and " + name_b + ": " + str(pair_similarity[0][0]))
#find the closest words to Alice (nearest neighbors)
nn_model = nn()
nn_model.fit(matrix.values)
#kneighbors returns (distances, indices); unpack instead of indexing a tuple
distances, indices = nn_model.kneighbors(alice_vector, 6)
for dist, idx in zip(distances[0], indices[0]):
    neighbor_word = matrix.index[idx]
    if neighbor_word != 'alice':
        print("Word: {}\nDistance: {}\n".format(neighbor_word, dist))
The five closest words to "Alice" appear to be: "I", "she", "a", "very" and "it". It makes sense that the personal pronouns "she" and "I" are the closest ones to "Alice", since they are used in very similar contexts and they are probably often followed by the same verbs. E.g. "Alice sees the rabbit", "she sees the rabbit" or "I see the rabbit", in direct dialogs. As for the other three words, they do not bring valuable information, as they are all quite common in the English language. A possible way to avoid this would be to remove stopwords from the corpus, in order to focus only on more meaningful terms.
Discussion of the drawbacks:
#Save your all the vector representations of your word embeddings in this way
#Change when necessary the sizes of the vocabulary/embedding dimension
#(word2vec text format: "<vocab size> <dim>" header, then one word + vector per line)
with open('vectors_co_occurrence.txt', "w") as f:
    f.write(" ".join([str(V-1), str(V-1)]))
    f.write("\n")
    #vectors = your word co-occurrence matrix
    vectors = matrix.values
    for row_idx, word in enumerate(tokenizer.word_index.keys()):
        f.write(word)
        f.write(" ")
        f.write(" ".join(map(str, list(vectors[row_idx, :]))))
        f.write("\n")
#reopen your file as follows
co_occurrence = KeyedVectors.load_word2vec_format('./vectors_co_occurrence.txt', binary=False)
Build embeddings with a keras implementation where the embedding vector is of length 50, 150 and 300. Use the Alice in Wonderland text book for training.
#function definitions for CBOW
#generate data for CBOW
def generate_data_cbow(corpus, window_size, V):
    """For every token, pair its padded context window (one row) with a
    one-hot encoding of the token itself."""
    max_context = window_size * 2
    inputs, targets = [], []
    for sentence in corpus:
        n_tokens = len(sentence)
        for pos, target_word in enumerate(sentence):
            lo = pos - window_size
            hi = pos + window_size + 1
            context = [sentence[k] for k in range(lo, hi) if 0 <= k < n_tokens and k != pos]
            inputs.append(sequence.pad_sequences([context], maxlen=max_context))
            targets.append(np_utils.to_categorical([target_word], V))
    return (inputs, targets)
#load the preprocessed CBOW data
def generate_data_cbow_from_file(path='data_cbow.txt'):
    """Yield (context, one-hot target) training pairs saved by the CBOW
    preprocessing step.

    Each line of the file is "<context ids>,<one-hot target>", both halves
    space-separated. `path` defaults to the file written above, so existing
    callers need no change.

    Yields:
        (inputs, outputs): int array of shape (1, 2*window_size) and
        float array of shape (1, V).
    """
    # `with` guarantees the handle is closed (the original leaked it);
    # np.fromstring is deprecated, so parse via split() instead.
    with open(path, 'r') as f:
        for row in f:
            inputs, outputs = row.split(",")
            inputs = np.array(inputs.split(), dtype=int).reshape(1, -1)
            outputs = np.array(outputs.split(), dtype=float).reshape(1, -1)
            yield (inputs, outputs)
#prepare data for cbow
#get x and y's for data
x,y = generate_data_cbow(corpus,window_size,V)
#save the preprocessed data of CBOW: one "inputs,outputs" line per training pair
with open('data_cbow.txt', 'w') as f:
    for context_rows, target_rows in zip(x, y):
        flat_in = np.concatenate(context_rows)
        f.write(" ".join(map(str, list(flat_in))))
        f.write(",")
        flat_out = np.concatenate(target_rows)
        f.write(" ".join(map(str, list(flat_out))))
        f.write("\n")
#create CBOW model
for dim in [50, 150, 300]:
    #build the network: averaged context embeddings -> softmax over the vocabulary
    cbow = Sequential()
    cbow.add(Embedding(input_dim=V, output_dim=dim, input_length=window_size*2))
    cbow.add(Lambda(lambda emb: K.mean(emb, axis=1), output_shape=(dim,)))
    cbow.add(Dense(input_dim=dim, units=V, kernel_initializer='uniform', activation='softmax'))
    #categorical cross-entropy against the one-hot centre word
    cbow.compile(loss='categorical_crossentropy', optimizer='adadelta')
    #train for 10 passes over the saved training pairs
    print("\nTraining cbow for dim="+str(dim))
    for epoch in range(10):
        epoch_loss = 0.
        for batch_in, batch_out in generate_data_cbow_from_file():
            epoch_loss += cbow.train_on_batch(batch_in, batch_out)
        print(epoch, epoch_loss)
    #persist the learned embedding matrix in word2vec text format
    with open('vectors_cbow_'+str(dim)+'.txt', 'w') as f:
        f.write(" ".join([str(V-1),str(dim)]))
        f.write("\n")
        vectors = cbow.get_weights()[0]
        for word, i in tokenizer.word_index.items():
            f.write(word)
            f.write(" ")
            f.write(" ".join(map(str, list(vectors[i,:]))))
            f.write("\n")
#function definitions for Skipgram
#generate data for Skipgram
def generate_data_skipgram(corpus, window_size, V):
    """For every token, pair the (repeated) centre word with one-hot
    encodings of each word inside its context window."""
    all_in, all_out = [], []
    for sentence in corpus:
        n_tokens = len(sentence)
        for pos, centre in enumerate(sentence):
            centre_ids = []
            context_words = []
            for k in range(pos - window_size, pos + window_size + 1):
                if k != pos and 0 <= k < n_tokens:
                    centre_ids.append([centre])
                    context_words.append(sentence[k])
            if centre_ids:
                all_in.append(np.array(centre_ids, dtype=np.int32))
                all_out.append(np_utils.to_categorical(context_words, V))
    return (all_in, all_out)
#load the preprocessed Skipgram data
def generate_data_skipgram_from_file(path='data_skipgram.txt'):
    """Yield (centre-word, one-hot context) training batches saved by the
    Skipgram preprocessing step.

    Each line is "<centre ids>,<flattened one-hot rows>"; the centre word is
    repeated once per context word. `path` defaults to the file written
    above, so existing callers need no change.

    Yields:
        (inputs, outputs): int array of shape (k, 1) and float array of
        shape (k, V) for a centre word with k context words.
    """
    # `with` guarantees the handle is closed (the original leaked it);
    # np.fromstring is deprecated, so parse via split() instead.
    with open(path, 'r') as f:
        for row in f:
            inputs, outputs = row.split(",")
            inputs = np.array(inputs.split(), dtype=int)
            n_pairs = len(inputs)  # one (centre, context) pair per id
            inputs = np.asarray(np.split(inputs, n_pairs))
            outputs = np.array(outputs.split(), dtype=float)
            outputs = np.asarray(np.split(outputs, n_pairs))
            yield (inputs, outputs)
#prepare data for Skipgram
#get x and y's for data
x,y = generate_data_skipgram(corpus,window_size,V)
#save the preprocessed data of Skipgram: one "inputs,outputs" line per centre word
with open('data_skipgram.txt', 'w') as f:
    for centre_rows, context_rows in zip(x, y):
        flat_in = np.concatenate(centre_rows)
        f.write(" ".join(map(str, list(flat_in))))
        f.write(",")
        flat_out = np.concatenate(context_rows)
        f.write(" ".join(map(str, list(flat_out))))
        f.write("\n")
#create Skipgram model
for dim in [50, 150, 300]:
    #build the network: one centre-word embedding -> softmax over the vocabulary
    skipgram = Sequential()
    skipgram.add(Embedding(input_dim=V, output_dim=dim, embeddings_initializer='glorot_uniform', input_length=1))
    skipgram.add(Reshape((dim, )))
    skipgram.add(Dense(input_dim=dim, units=V, kernel_initializer='uniform', activation='softmax'))
    #categorical cross-entropy against the one-hot context words
    skipgram.compile(loss='categorical_crossentropy', optimizer='adadelta')
    #train for 10 passes over the saved training pairs
    print("\nTraining skipgram for dim="+str(dim))
    for epoch in range(10):
        epoch_loss = 0.
        for batch_in, batch_out in generate_data_skipgram_from_file():
            epoch_loss += skipgram.train_on_batch(batch_in, batch_out)
        print(epoch, epoch_loss)
    #persist the learned embedding matrix in word2vec text format
    with open('vectors_skipgram_'+str(dim)+'.txt', 'w') as f:
        f.write(" ".join([str(V-1),str(dim)]))
        f.write("\n")
        vectors = skipgram.get_weights()[0]
        for word, i in tokenizer.word_index.items():
            f.write(word)
            f.write(" ")
            f.write(" ".join(map(str, list(vectors[i,:]))))
            f.write("\n")
Hidden layer: activation function
In order to add non-linearity to the models, we chose ReLU (Rectified Linear Unit) as the activation function for the hidden Dense layer.
#create CBOW model with additional dense layer
for dim in [50, 150, 300]:
    #build the network: averaged context embeddings -> ReLU hidden layer -> softmax
    cbow = Sequential()
    cbow.add(Embedding(input_dim=V, output_dim=dim, input_length=window_size*2))
    cbow.add(Lambda(lambda emb: K.mean(emb, axis=1), output_shape=(dim,)))
    cbow.add(Dense(256, activation="relu"))
    cbow.add(Dense(units=V, kernel_initializer='uniform', activation='softmax'))
    #categorical cross-entropy against the one-hot centre word
    cbow.compile(loss='categorical_crossentropy', optimizer='adadelta')
    #train for 10 passes over the saved training pairs
    print("\nTraining cbow with dense layer for dim="+str(dim))
    for epoch in range(10):
        epoch_loss = 0.
        for batch_in, batch_out in generate_data_cbow_from_file():
            epoch_loss += cbow.train_on_batch(batch_in, batch_out)
        print(epoch, epoch_loss)
    #persist the learned embedding matrix in word2vec text format
    with open('vectors_cbow_dense_'+str(dim)+'.txt', 'w') as f:
        f.write(" ".join([str(V-1),str(dim)]))
        f.write("\n")
        vectors = cbow.get_weights()[0]
        for word, i in tokenizer.word_index.items():
            f.write(word)
            f.write(" ")
            f.write(" ".join(map(str, list(vectors[i,:]))))
            f.write("\n")
#create Skipgram model with additional dense layer
for dim in [50, 150, 300]:
    #build the network: centre-word embedding -> ReLU hidden layer -> softmax
    skipgram = Sequential()
    skipgram.add(Embedding(input_dim=V, output_dim=dim, embeddings_initializer='glorot_uniform', input_length=1))
    skipgram.add(Reshape((dim, )))
    skipgram.add(Dense(256, activation="relu"))
    skipgram.add(Dense(units=V, kernel_initializer='uniform', activation='softmax'))
    #categorical cross-entropy against the one-hot context words
    skipgram.compile(loss='categorical_crossentropy', optimizer='adadelta')
    #train for 10 passes over the saved training pairs
    print("\nTraining skipgram with dense layer for dim="+str(dim))
    for epoch in range(10):
        epoch_loss = 0.
        for batch_in, batch_out in generate_data_skipgram_from_file():
            epoch_loss += skipgram.train_on_batch(batch_in, batch_out)
        print(epoch, epoch_loss)
    #persist the learned embedding matrix in word2vec text format
    with open('vectors_skipgram_dense_'+str(dim)+'.txt', 'w') as f:
        f.write(" ".join([str(V-1),str(dim)]))
        f.write("\n")
        vectors = skipgram.get_weights()[0]
        for word, i in tokenizer.word_index.items():
            f.write(word)
            f.write(" ")
            f.write(" ".join(map(str, list(vectors[i,:]))))
            f.write("\n")
# load word2vec models from files
def _load_kv(filename):
    #helper: read a word2vec-format text file into gensim KeyedVectors
    return KeyedVectors.load_word2vec_format(filename, binary=False)

skipgram_vectors_50 = _load_kv('vectors_skipgram_50.txt')
skipgram_vectors_150 = _load_kv('vectors_skipgram_150.txt')
skipgram_vectors_300 = _load_kv('vectors_skipgram_300.txt')
skipgram_dense_vectors_50 = _load_kv('vectors_skipgram_dense_50.txt')
skipgram_dense_vectors_150 = _load_kv('vectors_skipgram_dense_150.txt')
skipgram_dense_vectors_300 = _load_kv('vectors_skipgram_dense_300.txt')
cbow_vectors_50 = _load_kv('vectors_cbow_50.txt')
cbow_vectors_150 = _load_kv('vectors_cbow_150.txt')
cbow_vectors_300 = _load_kv('vectors_cbow_300.txt')
cbow_dense_vectors_50 = _load_kv('vectors_cbow_dense_50.txt')
cbow_dense_vectors_150 = _load_kv('vectors_cbow_dense_150.txt')
cbow_dense_vectors_300 = _load_kv('vectors_cbow_dense_300.txt')
#Implement your own analogy function
# analogy computed from exact matching
def analogy_matching(dictionary, path='analogy_alice.txt'):
    """Fraction of analogies "A B C D" answered exactly.

    For each line in `path` whose four words are all present in
    `dictionary` (a gensim KeyedVectors), computes A - B + C and checks
    whether the single nearest word to that vector is exactly D.

    Returns 0 when no analogy could be evaluated.
    """
    total = .0
    matches = .0
    # `with` closes the handle (the original leaked it); the unused
    # `scores` list was removed.
    with open(path, 'r') as f:
        for row in f:
            line = row.split()
            if all(x in dictionary for x in line):
                total += 1
                A = np.array(dictionary[line[0]], dtype=float)
                B = np.array(dictionary[line[1]], dtype=float)
                C = np.array(dictionary[line[2]], dtype=float)
                D = A - B + C
                # KeyedVectors exposes similar_by_vector directly; the `.wv`
                # self-alias is deprecated (and removed in gensim 4.x)
                word = dictionary.similar_by_vector(D, topn=1)[0][0]
                if (word == line[3]):
                    matches += 1
    return matches/total if total>0 else 0
# analogy compute from cosine similarity
def analogy_similarity(dictionary, path='analogy_alice.txt'):
    """Average cosine similarity between A - B + C and D over all
    analogies "A B C D" in `path` whose words are all in `dictionary`.

    Returns 0 when no analogy could be evaluated (the original raised
    ZeroDivisionError here; analogy_matching already guards this case).
    """
    scores = []
    # `with` closes the handle (the original leaked it)
    with open(path, 'r') as f:
        for row in f:
            line = row.split()
            if all(x in dictionary for x in line):
                A = np.array(dictionary[line[0]], dtype=float)
                B = np.array(dictionary[line[1]], dtype=float)
                C = np.array(dictionary[line[2]], dtype=float)
                D = np.array(dictionary[line[3]], dtype=float)
                pred = A - B + C
                denom = np.linalg.norm(pred) * np.linalg.norm(D)
                # plain-numpy cosine similarity; matches sklearn's
                # cosine_similarity, which maps zero vectors to 0
                scores.append(float(np.dot(pred, D) / denom) if denom > 0 else 0.0)
    return sum(scores)/len(scores) if scores else 0
#Analogy with exact matching scores for Skipgram and CBOW
#NOTE(review): deliberately repetitive straight-line reporting; the *_score
#variables computed below are reused later to build the comparison tables.
#skipgram scores
skipgram_50_matching_score = analogy_matching(skipgram_vectors_50)
print("Analogy with exact matching score for Skipgram model with dim=50: " + str(skipgram_50_matching_score))
skipgram_150_matching_score = analogy_matching(skipgram_vectors_150)
print("Analogy with exact matching score for Skipgram model with dim=150: " + str(skipgram_150_matching_score))
skipgram_300_matching_score = analogy_matching(skipgram_vectors_300)
print("Analogy with exact matching score for Skipgram model with dim=300: " + str(skipgram_300_matching_score))
print()
#skipgram + dense scores
skipgram_dense_50_matching_score = analogy_matching(skipgram_dense_vectors_50)
print("Analogy with exact matching score for Skipgram + dense model with dim=50: " + str(skipgram_dense_50_matching_score))
skipgram_dense_150_matching_score = analogy_matching(skipgram_dense_vectors_150)
print("Analogy with exact matching score for Skipgram + dense model with dim=150: " + str(skipgram_dense_150_matching_score))
skipgram_dense_300_matching_score = analogy_matching(skipgram_dense_vectors_300)
print("Analogy with exact matching score for Skipgram + dense model with dim=300: " + str(skipgram_dense_300_matching_score))
print()
#cbow scores
cbow_50_matching_score = analogy_matching(cbow_vectors_50)
print("Analogy with exact matching score for CBOW model with dim=50: " + str(cbow_50_matching_score))
cbow_150_matching_score = analogy_matching(cbow_vectors_150)
print("Analogy with exact matching score for CBOW model with dim=150: " + str(cbow_150_matching_score))
cbow_300_matching_score = analogy_matching(cbow_vectors_300)
print("Analogy with exact matching score for CBOW model with dim=300: " + str(cbow_300_matching_score))
print()
#cbow + dense scores (comment fixed: this section scores CBOW + dense, not Skipgram)
cbow_dense_50_matching_score = analogy_matching(cbow_dense_vectors_50)
print("Analogy with exact matching score for CBOW + dense model with dim=50: " + str(cbow_dense_50_matching_score))
cbow_dense_150_matching_score = analogy_matching(cbow_dense_vectors_150)
print("Analogy with exact matching score for CBOW + dense model with dim=150: " + str(cbow_dense_150_matching_score))
cbow_dense_300_matching_score = analogy_matching(cbow_dense_vectors_300)
print("Analogy with exact matching score for CBOW + dense model with dim=300: " + str(cbow_dense_300_matching_score))
print()
#Analogy with cosine similariy scores for Skipgram and CBOW
#skipgram scores
skipgram_50_score = analogy_similarity(skipgram_vectors_50)
print("Analogy with cosine similariy score for Skipgram model with dim=50: " + str(skipgram_50_score))
skipgram_150_score = analogy_similarity(skipgram_vectors_150)
print("Analogy with cosine similariy score for Skipgram model with dim=150: " + str(skipgram_150_score))
skipgram_300_score = analogy_similarity(skipgram_vectors_300)
print("Analogy with cosine similariy score for Skipgram model with dim=300: " + str(skipgram_300_score))
print()
#skipgram + dense scores
skipgram_dense_50_score = analogy_similarity(skipgram_dense_vectors_50)
print("Analogy with cosine similariy score for Skipgram + dense model with dim=50: " + str(skipgram_dense_50_score))
skipgram_dense_150_score = analogy_similarity(skipgram_dense_vectors_150)
print("Analogy with cosine similariy score for Skipgram + dense model with dim=150: " + str(skipgram_dense_150_score))
skipgram_dense_300_score = analogy_similarity(skipgram_dense_vectors_300)
print("Analogy with cosine similariy score for Skipgram + dense model with dim=300: " + str(skipgram_dense_300_score))
print()
#cbow scores
cbow_50_score = analogy_similarity(cbow_vectors_50)
print("Analogy with cosine similariy score for CBOW model with dim=50: " + str(cbow_50_score))
cbow_150_score = analogy_similarity(cbow_vectors_150)
print("Analogy with cosine similariy score for CBOW model with dim=150: " + str(cbow_150_score))
cbow_300_score = analogy_similarity(cbow_vectors_300)
print("Analogy with cosine similariy score for CBOW model with dim=300: " + str(cbow_300_score))
print()
#cbow + dense scores (comment fixed: this section scores CBOW + dense, not Skipgram)
cbow_dense_50_score = analogy_similarity(cbow_dense_vectors_50)
print("Analogy with cosine similariy score for CBOW + dense model with dim=50: " + str(cbow_dense_50_score))
cbow_dense_150_score = analogy_similarity(cbow_dense_vectors_150)
print("Analogy with cosine similariy score for CBOW + dense model with dim=150: " + str(cbow_dense_150_score))
cbow_dense_300_score = analogy_similarity(cbow_dense_vectors_300)
print("Analogy with cosine similariy score for CBOW + dense model with dim=300: " + str(cbow_dense_300_score))
print()
Comparison performance
We first tried to implement an analogy function in which the score is computed by counting the number of times in which the word returned by our models matches exactly the expected word. We observed that all the models scored 0, since the words were never totally accurate (some of them were close, but not identical, e.g. "see" and "sees"), thus not giving us helpful information to evaluate our models. For this reason we implemented a second function based on cosine similarity.
In this function, we test the analogy calculation of 3 terms, i.e. A-B+C=D, by taking the vector of the expected answer, i.e. D, from the tested model, and then calculating how close it is to the actual vector result of A-B+C, using cosine similarity. The analogies used in the calculation are taken from the file "analogy_alice.txt". The analogy score is then defined as the average cosine similarity between the expected and the actual vector over all analogy calculations.
Looking at the results above, we can conclude that a Skipgram model with 50 dimensions gives the best analogy results. It gives analogy score of around 17% - which is not very high - but it is the closest to 1, which means that more analogy calculation results are similar to the expected answer compared to the other Skipgram and CBOW models (with and without dense layers).
# Visualization of results
def tsne_plot(model, title):
    """Project a model's word vectors to 2-D with t-SNE and scatter-plot
    them, one annotated point per vocabulary word."""
    vocab_words = list(model.wv.vocab)
    word_vectors = [model[w] for w in vocab_words]
    projector = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    coords = projector.fit_transform(word_vectors)
    plt.figure(figsize=(12, 12))
    for (px, py), word in zip(coords, vocab_words):
        plt.scatter(px, py)
        plt.annotate(word,
                     xy=(px, py),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.title(title)
    plt.show()
# Visualize embeddings: one t-SNE plot per trained model
for embedding_model, plot_title in [
        (skipgram_vectors_50, "Skipgram (dim=50)"),
        (skipgram_vectors_150, "Skipgram (dim=150)"),
        (skipgram_vectors_300, "Skipgram (dim=300)"),
        (skipgram_dense_vectors_50, "Skipgram with dense layer (dim=50)"),
        (skipgram_dense_vectors_150, "Skipgram with dense layer (dim=150)"),
        (skipgram_dense_vectors_300, "Skipgram with dense layer (dim=300)"),
        (cbow_vectors_50, "CBOW (dim=50)"),
        (cbow_vectors_150, "CBOW (dim=150)"),
        (cbow_vectors_300, "CBOW (dim=300)"),
        (cbow_dense_vectors_50, "CBOW with dense layer (dim=50)"),
        (cbow_dense_vectors_150, "CBOW with dense layer (dim=150)"),
        (cbow_dense_vectors_300, "CBOW with dense layer (dim=300)")]:
    tsne_plot(embedding_model, plot_title)
Embedding visualization results
From the visualization plots it is possible to see that every model has a very different distribution of the vectors: the default Skipgram and CBOW models tend to have zones with a high concentration of words, while the vectors in the models with the dense layer are more sparse. It is also noticeable that the norm of the vectors generally tends to decrease as the embedding dimension increases, as we can see from the scale of the graphs.
One particular plot that stands out from the others is the one for the Skipgram model with dense layer and dimension 300: the vectors seem to be equally distributed over the graph. This will probably result in low performance, since the information about similar words will hardly be accurate.
By observing some of the groups of neighbouring words, it seems that the best distance-similarity correlation is achieved in the two models with no hidden layer and with embedding dimension of 50.
#Comparison to the co-occurrence-matrix embeddings
matrix_score = analogy_similarity(co_occurrence)
print("Analogy score for co-occurrence matrix: " + str(matrix_score))
#Visualize co-occurrence matrix embeddings
tsne_plot(co_occurrence,"Co-occurrence matrix")
#Visualization results trained word embeddings
#build the model-by-dimension score table in one construction instead of row-wise .loc assignment
score_rows = {
    "Skipgram": [skipgram_50_score, skipgram_150_score, skipgram_300_score],
    "Skipgram + dense": [skipgram_dense_50_score, skipgram_dense_150_score, skipgram_dense_300_score],
    "CBOW": [cbow_50_score, cbow_150_score, cbow_300_score],
    "CBOW + dense": [cbow_dense_50_score, cbow_dense_150_score, cbow_dense_300_score],
    "Co-occurrence matrix": [matrix_score, matrix_score, matrix_score],
}
scores = pd.DataFrame.from_dict(score_rows, orient='index', columns=["50","150","300"])
display(scores)
#plot score vs. embedding size, one line per model
scores = scores.T
scores = scores.reindex(scores.index.rename('Size'))
ax = scores.plot(figsize=(12,4))
ax.set_xticks(range(0, 3))
ax.set_xticklabels(scores.index)
plt.ylabel('Analogy score')
plt.legend(loc='best');
Interpretation results of the visualization
From the visualization above, we can see that the analogy scores of the Skipgram and Skipgram-with-dense-layer models initially have a relatively big difference. However, the more dimensions are included in either type, the closer the analogy scores seem to converge. The same thing also happens with the CBOW models. In general, it seems that the plain Skipgram and CBOW models perform better than their dense-layered counterparts. It also seems that increasing dimensionality results in worse performance for all models, except for CBOW with dense layers, which performs best at 300 dimensions.
Compare the results of the trained word embeddings with the word-word co-occurrence matrix
Only the 50-dimension Skipgram model has better performance than the co-occurrence matrix. The co-occurrence matrix has an analogy score of around 15% and the 50-dimension Skipgram scored 17%. With the exception of the 150-dimension Skipgram and the 50-dimension CBOW, the analogy scores of the other models are quite close to 0. That means that for these models, most analogies in "analogy_alice.txt" give results that have no correlation to the expected result.
Discussion of the advantages of CBOW and Skipgram, the advantages of negative sampling and drawbacks of CBOW and Skipgram
Advantages:
Drawbacks:
Negative sampling:
#load pretrained word embeddings of word2vec (Google News vectors, binary format)
path_word2vec = "../GoogleNews-vectors-negative300.bin"
word2vec = KeyedVectors.load_word2vec_format(path_word2vec, binary=True)
#load pretrained word embeddings of GloVe
from gensim.scripts import glove2word2vec
#NOTE(review): this rebinds the module-level `path` (previously 'alice.txt')
path = "../glove.6B/glove.6B.300d.txt"
#convert GloVe into word2vec format
#(num_vectors/num_dims are informational only and not used below)
num_vectors, num_dims = glove2word2vec.get_glove_info(path)
glove2word2vec.glove2word2vec(path, "glove_converted.txt");
#load glove from saved model
glove = KeyedVectors.load_word2vec_format("glove_converted.txt", binary=False)
glove_score = analogy_similarity(glove)
print("Analogy score for glove: " + str(glove_score))
word2vec_score = analogy_similarity(word2vec)
print("Analogy score for word2vec: " + str(word2vec_score))
#Visualize the pre-trained word embeddings
#build the comparison table (analogy score + vocabulary size) in one construction
model_rows = {
    "Skipgram 300": [skipgram_300_score, len(skipgram_vectors_300.index2word)],
    "Skipgram 300 + dense": [skipgram_dense_300_score, len(skipgram_dense_vectors_300.index2word)],
    "CBOW 300": [cbow_300_score, len(cbow_vectors_300.index2word)],
    "CBOW 300 + dense": [cbow_dense_300_score, len(cbow_dense_vectors_300.index2word)],
    "Glove": [glove_score, len(glove.index2word)],
    "word2vec": [word2vec_score, len(word2vec.index2word)],
}
scores = pd.DataFrame.from_dict(model_rows, orient='index', columns=['Analogy score', '# vectors'])
display(scores)
#bar chart of analogy scores only
scores.drop(columns=['# vectors'], inplace=True)
scores = scores.T
scores = scores.reindex(scores.index.rename('Model'))
ax = scores.plot(figsize=(8,6), kind='bar')
ax.set_xticks([])
plt.ylabel('Analogy score')
plt.legend(loc='best');
Comparison performance with your own trained word embeddings
As shown in the graph above, the performance of analogy calculation on Glove pre-trained model far exceeds our own trained models. The same goes with word2vec pre-trained model, which gives the second best analogy score.
From the table above the graph, we see that GloVe has 400,000 vectors and word2vec has 3,000,000 vectors. Meanwhile, our trained models only have 1,182 vectors. This means that the pre-trained models were trained on corpora that contain far more words than "Alice in Wonderland", and therefore have more information on the analogy terms than our trained models. Therefore, it is not surprising that the pre-trained models give more accurate answers compared to our own trained models.